library(ggplot2)
library(dplyr)
library(reshape2)
library(fst)

#load sibling data
load("all_sibs.rdata")
#load(paste(work_data, "projects/who_becomes_dk/data/desc_comp_desc.rdata", sep = ""))
load("main_data_local.rdata")
load("main_data_parl.rdata")

#create function for splitting income residuals in brackets

stacker <- 
  function(x){
    stack <- NA
    stack[x >=-3.25] <- -3
    stack[x > -2.75] <- -2.5
    stack[x > -2.25] <- -2
    stack[x > -1.75] <- -1.5
    stack[x > -1.25] <- -1
    stack[x > - .75] <- -.5
    stack[x > - .25] <- 0
    stack[x >   .25] <- .5
    stack[x >   .75] <- 1
    stack[x >  1.25] <- 1.5
    stack[x >  1.75] <- 2
    stack[x >  2.25] <- 2.5
    stack[x >  2.75] <- 3
    return(stack)
  }


kv_years <- seq(1997, 2013, 4)
fv_years <- c(1994, 1998, 2001, 2005, 2007, 2011, 2015)
mod_ests <- tibble()

bar_data <- data_frame()

for(k in kv_years){
  # remove bottom 0.1% and top 0.1% to avoid huge outliers. 
  # rescale income residual 
  # remove abs(z) > 3.25
  
  data_an <- 
    data_comp_local %>% 
    filter(four_years == k) %>% 
    filter(in_year == 1) %>% 
    filter(inc_res < quantile(inc_res, p = 0.999) &
             inc_res > quantile(inc_res, p = 0.001) ) %>% 
    filter(first_elected == 0 | first_elected >= k) %>% 
    mutate(inc_res2 = (inc_res - mean(inc_res)) / sd(inc_res)) %>%
    filter(abs(inc_res2) <= 3.25) %>% 
    mutate(inc_res_bars = stacker(inc_res2))
  
  full_sib$quant_year <- unlist(full_sib[,paste("run_kv_", k, sep = "")])
  
  sibs_year <- 
    as.data.frame(full_sib) %>% 
    group_by(fe_sibship) %>% 
    summarise(pol_fam = sum(quant_year))
  
  sibs_year <- 
    sibs_year %>% 
    filter(pol_fam > 0) 
  
  sibs_pol <- 
    full_sib %>% 
    filter(fe_sibship %in% sibs_year$fe_sibship)
  
  data_an <-
    data_an %>% 
    filter(PNR %in% sibs_pol$PNR) %>%
    left_join(., sibs_pol %>% 
                select(c("PNR", "fe_sibship")), 
              by = "PNR")
  
  # group by z categories for running candidates and their sibling
  plot_data <- 
    data_frame(type = "local candidate",
               z = seq(-3, 3, 0.5), 
               year = k) %>%
    left_join(., data_an %>% 
                group_by(inc_res_bars) %>% 
                summarize(population_margin = n()), by =  c("z" = "inc_res_bars")) %>% 
    left_join(., data_an %>%
                filter(run_kv == 1) %>%
                group_by(inc_res_bars) %>% 
                summarize(politician_margin = n()), by =  c("z" = "inc_res_bars")) %>% 
    left_join(., data_an %>%
                filter(run_kv == 0) %>%
                group_by(inc_res_bars) %>% 
                summarize(sibling = n()), by =  c("z" = "inc_res_bars"))
  
  bar_data <-
    bar_data %>%
    bind_rows(plot_data)  
  
  mod <- lm(inc_res ~ run_kv + as.factor(fe_sibship), data = data_an)
  
  mod_out <- tibble(level = "Local Council",
                    comparison = "Running",
                    year = k, 
                    est = summary(mod)$coefficients[2,1],
                    se = summary(mod)$coefficients[2,2])
  
  mod_ests <- 
    bind_rows(mod_ests, mod_out)  
  
  print(k)
  print(Sys.time())
}


# repeat for elected for local gov ----------------------------------------

for(k in kv_years){
  # remove bottom 0.1% and top 0.1% to avoid huge outliers. 
  # rescale income residual 
  # remove abs(z) > 3.25
  
  data_an <- 
    data_comp_local %>% 
    filter(four_years == k) %>% 
    filter(in_year == 1) %>% 
    filter(inc_res < quantile(inc_res, p = 0.999) &
             inc_res > quantile(inc_res, p = 0.001) ) %>% 
    mutate(inc_res2 = (inc_res - mean(inc_res)) / sd(inc_res)) %>%
    filter(first_elected == 0 | first_elected >= k) %>% 
    filter(abs(inc_res2) <= 3.25) %>% 
    mutate(inc_res_bars = stacker(inc_res2))
  
  full_sib$quant_year <- unlist(full_sib[,paste("elected_kv_", k, sep = "")])
  
  sibs_year <- 
    as.data.frame(full_sib) %>% 
    group_by(fe_sibship) %>% 
    summarise(pol_fam = sum(quant_year))
  
  sibs_year <- 
    sibs_year %>% 
    filter(pol_fam > 0) 
  
  sibs_pol <- 
    full_sib %>% 
    filter(fe_sibship %in% sibs_year$fe_sibship)
  
  data_an <-
    data_an %>% 
    filter(PNR %in% sibs_pol$PNR) %>%
    left_join(., sibs_pol %>% 
                select(c("PNR", "fe_sibship")), 
              by = "PNR")
  
  # group by z categories for running candidates and their sibling
  plot_data <- 
    data_frame(type = "local winner",
               z = seq(-3, 3, 0.5), 
               year = k) %>%
    left_join(., data_an %>% 
                group_by(inc_res_bars) %>% 
                summarize(population_margin = n()), by =  c("z" = "inc_res_bars")) %>% 
    left_join(., data_an %>%
                filter(run_kv == 1) %>%
                group_by(inc_res_bars) %>% 
                summarize(politician_margin = n()), by =  c("z" = "inc_res_bars")) %>% 
    left_join(., data_an %>%
                filter(run_kv == 0) %>%
                group_by(inc_res_bars) %>% 
                summarize(sibling = n()), by =  c("z" = "inc_res_bars"))
  
  bar_data <-
    bar_data %>%
    bind_rows(plot_data)  
  
  mod <- lm(inc_res ~ run_kv + as.factor(fe_sibship), data = data_an)
  
  mod_out <- tibble(level = "Local Council",
                    comparison = "Elected",
                    year = k, 
                    est = summary(mod)$coefficients[2,1],
                    se = summary(mod)$coefficients[2,2])
  
  mod_ests <- 
    bind_rows(mod_ests, mod_out)  
  
  print(k)
  print(Sys.time())
}

bar_data_fv <- data_frame()

for(k in fv_years){
  # remove bottom 0.1% and top 0.1% to avoid huge outliers. 
  # rescale income residual 
  # remove abs(z) > 3.25
  
  data_an <- 
    data_comp_parl %>% 
    filter(four_years == k) %>% 
    filter(in_year == 1) %>% 
    filter(inc_res < quantile(inc_res, p = 0.999) &
             inc_res > quantile(inc_res, p = 0.001) ) %>% 
    filter(first_elected == 0 | first_elected >= k) %>% 
    mutate(inc_res2 = (inc_res - mean(inc_res)) / sd(inc_res)) %>%
    filter(abs(inc_res2) <= 3.25) %>% 
    mutate(inc_res_bars = stacker(inc_res2))
  
  full_sib$quant_year <- unlist(full_sib[,paste("run_fv_", k, "_FV", sep = "")])
  
  sibs_year <- 
    as.data.frame(full_sib) %>% 
    group_by(fe_sibship) %>% 
    summarise(pol_fam = sum(quant_year))
  
  sibs_year <- 
    sibs_year %>% 
    filter(pol_fam > 0) 
  
  sibs_pol <- 
    full_sib %>% 
    filter(fe_sibship %in% sibs_year$fe_sibship)
  
  data_an <-
    data_an %>% 
    filter(PNR %in% sibs_pol$PNR) %>%
    left_join(., sibs_pol %>% 
                select(c("PNR", "fe_sibship")), 
              by = "PNR")
  
  
  # group by z categories for running candidates and their sibling
  plot_data <- 
    data_frame(type = "parliament candidate",
               z = seq(-3, 3, 0.5), 
               year = k) %>%
    left_join(., data_an %>% 
                group_by(inc_res_bars) %>% 
                summarize(population_margin = n()), by =  c("z" = "inc_res_bars")) %>% 
    left_join(., data_an %>%
                filter(run_fv == 1) %>%
                group_by(inc_res_bars) %>% 
                summarize(politician_margin = n()), by =  c("z" = "inc_res_bars")) %>% 
    left_join(., data_an %>%
                filter(run_fv == 0) %>%
                group_by(inc_res_bars) %>% 
                summarize(sibling = n()), by =  c("z" = "inc_res_bars")) %>%
    mutate(population_margin = ifelse(is.na(population_margin), 0, population_margin),
           politician_margin = ifelse(is.na(politician_margin), 0, politician_margin),
           sibling = ifelse(is.na(sibling), 0, sibling))
  
  bar_data_fv <-
    bar_data_fv %>%
    bind_rows(plot_data)  
  
  print(k)
  print(Sys.time())
  
  mod <- lm(inc_res ~ run_fv + as.factor(fe_sibship), data = data_an)

  mod_out <- tibble(level = "Parliament",
                    comparison = "Running",
                    year = k, 
                    est = summary(mod)$coefficients[2,1],
                    se = summary(mod)$coefficients[2,2])
  
  mod_ests <- 
    bind_rows(mod_ests, mod_out)
}


# repeat for elected for MPs ----------------------------------------

for(k in fv_years){
  # remove bottom 0.1% and top 0.1% to avoid huge outliers. 
  # rescale income residual 
  # remove abs(z) > 3.25
  
  data_an <- 
    data_comp_parl %>% 
    filter(four_years == k) %>% 
    filter(in_year == 1) %>% 
    filter(inc_res < quantile(inc_res, p = 0.999) &
             inc_res > quantile(inc_res, p = 0.001) ) %>% 
    filter(first_elected == 0 | first_elected >= k) %>% 
    mutate(inc_res2 = (inc_res - mean(inc_res)) / sd(inc_res)) %>%
    filter(abs(inc_res2) <= 3.25) %>% 
    mutate(inc_res_bars = stacker(inc_res2))
  
  full_sib$quant_year <- unlist(full_sib[,paste("elected_fv_", k, sep = "")])
  
  sibs_year <- 
    as.data.frame(full_sib) %>% 
    group_by(fe_sibship) %>% 
    summarise(pol_fam = sum(quant_year))
  
  sibs_year <- 
    sibs_year %>% 
    filter(pol_fam > 0) 
  
  sibs_pol <- 
    full_sib %>% 
    filter(fe_sibship %in% sibs_year$fe_sibship)
  
  data_an <-
    data_an %>% 
    filter(PNR %in% sibs_pol$PNR) %>%
    left_join(., sibs_pol %>% 
                select(c("PNR", "fe_sibship")), 
              by = "PNR")
  
  # group by z categories for running candidates and their sibling
  plot_data <- 
    data_frame(type = "parliament winner",
               z = seq(-3, 3, 0.5), 
               year = k) %>%
    left_join(., data_an %>% 
                group_by(inc_res_bars) %>% 
                summarize(population_margin = n()), by =  c("z" = "inc_res_bars")) %>% 
    left_join(., data_an %>%
                filter(run_fv == 1) %>%
                group_by(inc_res_bars) %>% 
                summarize(politician_margin = n()), by =  c("z" = "inc_res_bars")) %>% 
    left_join(., data_an %>%
                filter(run_fv == 0) %>%
                group_by(inc_res_bars) %>% 
                summarize(sibling = n()), by =  c("z" = "inc_res_bars")) %>%
    mutate(population_margin = ifelse(is.na(population_margin), 0, population_margin),
           politician_margin = ifelse(is.na(politician_margin), 0, politician_margin),
           sibling = ifelse(is.na(sibling), 0, sibling))
  
  bar_data_fv <-
    bar_data_fv %>%
    bind_rows(plot_data)  
  
  mod <- lm(inc_res ~ run_fv + as.factor(fe_sibship), data = data_an)
  
  mod_out <- tibble(level = "Parliament",
                    comparison = "Elected",
                    year = k, 
                    est = summary(mod)$coefficients[2,1],
                    se = summary(mod)$coefficients[2,2])
  
  mod_ests <- 
    bind_rows(mod_ests, mod_out)
  
  print(k)
  print(Sys.time())
}
# create plots 

bar_data <-
  as.data.frame(bar_data)

bar_data2 <- 
  melt(bar_data, 
       id = c("type", "z", "year")) %>%
  group_by(type, z, variable) %>% 
  summarise(count = sum(value)) 

# repeat for fv data
bar_data2_fv <- 
  as.data.frame(bar_data_fv) %>% 
  melt(., 
       id = c("type", "z", "year")) %>%
  group_by(type, z, variable) %>% 
  summarise(count = sum(value)) 

suppressWarnings({
  bar_data2 <- 
    bind_rows(bar_data2, bar_data2_fv) %>% 
    group_by(type, z, variable) %>% 
    summarise(count = sum(count)) 
})

bar_data_count <- 
  bar_data2 %>% 
  group_by(type, variable) %>% 
  summarise(total =  sum(count))

bar_data2 <- 
  left_join(bar_data2, bar_data_count, by = c("type", "variable")) %>% 
  mutate(share = count / total)

# change order of factor

bar_data2 <- 
  bar_data2 %>%
  ungroup() %>%
  mutate(variable = factor(variable, levels = levels(variable)[c(1,3,2)]),
         type = as.factor(c(rep("Running for \nmunicipality", 39),
                  rep("Elected for \nmunicipality", 39),
                  rep("Running for \nparliament"  , 39), 
                  rep("Elected for \nparliament"  , 39))),
         type = factor(type, levels = levels(type)[c(3,1,4,2)]))

# create plot

plot <-
  ggplot(data = bar_data2 %>% filter(variable != "population_margin"),
         aes(x = z, 
             y = share, 
             alpha = variable) ) +
  facet_grid(type ~ .) +
  geom_bar(stat = "identity", position = "dodge" ) +
  theme_classic() + 
  scale_alpha_discrete(range = c(0.2, 1), "", 
                       labels = c("Sibling", 
                                  "Politician")) + 
  scale_x_continuous("", breaks = -3:3) + 
  scale_y_continuous("") + 
  theme(legend.position = "top")

plot_data <- 
  bar_data2

# add population data from main plot
load("mincer_comp_data.rdata")

plot <- 
  plot + 
  geom_bar(data = bar_data2 %>% filter(variable == "population_margin"),
           aes(x = z, 
               y = share), 
           stat = "identity", fill = "white", colour = "black", alpha = 1/10 ) 